# -*- coding: utf-8 -*-

# Scrapy settings for coolscrapy project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
import logging
import os

# Name of the bot implemented by this Scrapy project.
BOT_NAME = "coolscrapy"

# All spiders live in one package: Scrapy looks for spiders there and
# `scrapy genspider` creates new ones in the same place.
NEWSPIDER_MODULE = "coolscrapy.spiders"
SPIDER_MODULES = [NEWSPIDER_MODULE]

# Item pipelines, keyed by class path; pipelines with a lower number run first.
# Currently disabled pipelines (with the order value they used):
#   coolscrapy.pipelines.DuplicatesPipeline       (1)
#   coolscrapy.pipelines.FilterWordsPipeline      (2)
#   coolscrapy.pipelines.JsonWriterPipeline       (3)
#   coolscrapy.pipelines.JsonExportPipeline       (4)
#   coolscrapy.pipelines.ArticleDataBasePipeline  (5)
ITEM_PIPELINES = {
    "coolscrapy.pipelines.TobaccoImagePipeline": 6,
    "coolscrapy.pipelines.TobaccoDatabasePipeline": 7,
}
# Downloader middlewares, keyed by class path. The number is the middleware's
# order; a value of None disables a middleware that Scrapy enables by default.
DOWNLOADER_MIDDLEWARES = {
    # Built-in User-Agent middleware is switched off in favour of the
    # project's RotateUserAgentMiddleware.
    "scrapy.downloadermiddlewares.useragent.UserAgentMiddleware": None,
    "coolscrapy.middlewares.RotateUserAgentMiddleware": 400,
    # scrapy-splash middlewares.
    "scrapy_splash.SplashCookiesMiddleware": 723,
    "scrapy_splash.SplashMiddleware": 725,
    # Explicit order for HTTP compression alongside the Splash middlewares.
    "scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware": 810,
}
# Spider middlewares; the number (e.g. 543) is the order in which a
# middleware runs. None are enabled at the moment.
# Disabled entry kept for reference:
#   'coolscrapy.middlewares.UrlUniqueMiddleware': 543
SPIDER_MIDDLEWARES = {}

# Settings that lower the risk of being banned by the target site.
DOWNLOAD_TIMEOUT = 20  # seconds to wait before a download times out
DOWNLOAD_DELAY = 5  # seconds to wait between consecutive requests to a site
# Cookie handling. Scrapy's setting is spelled COOKIES_ENABLED; the earlier
# spelling COOKIES_ENABLES was not a Scrapy setting and was silently ignored.
# True equals Scrapy's default, so the effective behaviour is unchanged.
# Set this to False to actually disable cookies.
COOKIES_ENABLED = True
# COOKIES_DEBUG = True

# Logging: messages at INFO level and above go to LOG_FILE, and LOG_STDOUT
# also redirects the process's standard output into the log.
# NOTE(review): LOG_FILE is a Windows-style absolute path; confirm it exists
# on the machine that runs the crawler.
LOG_FORMAT = "%(asctime)s [%(name)s] %(levelname)s: %(message)s"
LOG_FILE = "E:/logs/spider.log"
LOG_STDOUT = True
LOG_LEVEL = logging.INFO


# Database connection parameters.
# (presumably consumed by the project's database pipeline -- verify there)
#
# MySQL driver installation:
#   Windows: pip install mysqlclient
#   Linux:   pip install MySQL-python
#
# Host, port, user, password and schema can each be overridden with a
# COOLSCRAPY_DB_* environment variable, so real credentials need not be
# committed to source control. Without those variables the values below are
# used, which keeps the previous behaviour. The port stays a string either way.
DATABASE = {
    'drivername': 'mysql',
    'host': os.environ.get('COOLSCRAPY_DB_HOST', '123.207.66.156'),
    'port': os.environ.get('COOLSCRAPY_DB_PORT', '3306'),
    'username': os.environ.get('COOLSCRAPY_DB_USER', 'root'),
    'password': os.environ.get('COOLSCRAPY_DB_PASSWORD', '******'),
    'database': os.environ.get('COOLSCRAPY_DB_NAME', 'test'),
    'query': {'charset': 'utf8'},
}

# Image download settings.
# Directory in which downloaded images are stored.
IMAGES_STORE = "E:/logs/"
# Images fetched within the last 30 days are not downloaded again.
IMAGES_EXPIRES = 30
# Prefix for image links.
URL_PREFIX = "http://enzhico.net/pics/"

# Support for pages that load content asynchronously with JavaScript
# (scrapy-splash): the Splash-aware cache storage and duplicate filter,
# plus the address of the Splash server.
HTTPCACHE_STORAGE = "scrapy_splash.SplashAwareFSCacheStorage"
DUPEFILTER_CLASS = "scrapy_splash.SplashAwareDupeFilter"
SPLASH_URL = "http://192.168.203.91:8050"

# Extension: limit the number of items scraped before the spider is closed
# CLOSESPIDER_ITEMCOUNT = 10

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'coolscrapy (+http://www.yourdomain.com)'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS=32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY=3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN=16
# CONCURRENT_REQUESTS_PER_IP=16

# Disable cookies (enabled by default)
# COOKIES_ENABLED=False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED=False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#    'coolscrapy.middlewares.MyCustomSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#    'coolscrapy.middlewares.MyCustomDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#    'scrapy.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# ITEM_PIPELINES = {
#    'coolscrapy.pipelines.SomePipeline': 300,
# }

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# NOTE: AutoThrottle will honour the standard settings for concurrency and delay
# AUTOTHROTTLE_ENABLED=True
# The initial download delay
# AUTOTHROTTLE_START_DELAY=5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY=60
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG=False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED=True
# HTTPCACHE_EXPIRATION_SECS=0
# HTTPCACHE_DIR='httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES=[]
# HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage'
